1 Introduction

This webbook contains all the code used for data analysis in study of the population-level metagenomic data of Podarcis muralis lizards across elevational gradients in various mountain ranges of the Pyrenees.

1.1 Prepare the R environment

1.1.1 Environment

To reproduce all the analyses locally, clone this repository in your computer using:

RStudio > New Project > Version Control > Git

And indicating the following git repository:

https://github.com/alberdilab/elevational_hologenomics.git

Once the R project has been created, follow the instructions and code chunks shown in this webbook.

1.1.2 Libraries

The following R packages are required for the data analysis.

# Base
library(R.utils)
library(knitr)
library(tidyverse)
library(devtools)
library(tinytable)

# For tree handling
library(ape)
library(phyloseq)
library(phytools)

# For plotting
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(ggnewscale)
library(gridExtra)
library(ggtreeExtra)
library(ggtree)
library(ggh4x)

# For statistics
library(spaa)
library(vegan)
library(Rtsne)
library(geiger)
library(hilldiv2)
library(distillR)
library(broom.mixed)
library(Hmsc)
library(corrplot)

2 Prepare data

2.1 Load data

Load the original data files outputted by the bioinformatic pipeline.

2.1.1 Sample metadata

sample_metadata <- read.csv("data/Pyrenees_metadata_all_v2.csv",sep=",",header=T)%>%
  filter(EHI_number != "EHI00102") %>%
  filter(EHI_number != "EHI00182") %>%
  filter(EHI_number !="EHI00435") %>%
  filter(EHI_number !="EHI00126") #genome not P.muralis

2.1.2 Read counts

read_counts <- read_tsv("data/DMB0113_counts.tsv") %>%
  rename(genome = 1) %>%
  select(-EHI00102, -EHI00182,-EHI00435,-EHI00126) #remove samples

2.1.3 Genome taxonomy

genome_metadata <- read_tsv("data/DMB0113_mag_info.tsv") %>%
    rename(length=mag_size)

2.1.4 Genome base hits

genome_coverage <- read_tsv("data/DMB0113_coverage.tsv") %>%
  rename(genome = 1) %>%
  select(-EHI00102, -EHI00182,-EHI00435,-EHI00126) %>% #remove samples
  semi_join(genome_metadata, by = "genome")

2.1.5 Genome tree

genome_tree <- read_tree("data/DMB0113.tree")
genome_tree$tip.label <- str_replace_all(genome_tree$tip.label,"'", "") #remove single quotes in MAG names
genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips

2.1.6 Genome annotations

Distill annotations already into GIFTs

genome_gifts_raw="data/GIFTs.tsv"
genome_gifts <- read.table(genome_gifts_raw,header=T, sep="\t", row.names=1)

2.2 Create working objects

Transform the original data files into working objects for downstream analyses.

2.2.1 Filter reads by coverage

read_counts <- read_counts %>%
  semi_join(genome_metadata, by = "genome")

min_coverage=0.3
read_counts_filt <- genome_coverage %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts[[cur_column()]])) 

2.2.2 Transform reads into genome counts

readlength=150
genome_counts <- read_counts %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))
readlength=150
genome_counts_filt <- read_counts_filt %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

2.3 Prepare color scheme

AlberdiLab projects use unified color schemes developed for the Earth Hologenome Initiative, to facilitate figure interpretation.

phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(phylum, colors) %>% 
    unique() %>%
    arrange(phylum) %>%
    pull(colors, name=phylum)

2.4 Wrap working objects

All working objects are wrapped into a single Rdata object to facilitate downstream usage.

save(sample_metadata, 
     genome_metadata, 
     read_counts, 
     genome_counts, 
     genome_counts_filt, 
     genome_tree,
    genome_gifts, 
     phylum_colors,
     file = "data/data.Rdata")

3 Data summary

load("data/data.Rdata")

Summary of sampled individuals and analysed faecal samples.

#number of samples
length(sample_metadata$EHI_number)
[1] 105
#number of samples by transect
sample_metadata %>%
  group_by(Transect) %>%
  summarise(n_samples = length(EHI_number)) %>%
  tt()
tinytable_wjog79e29zzebhlciprf
Transect n_samples
Aisa 22
Aran 37
Sentein 19
Tourmalet 27
#number of samples by transect and elevation
sample_metadata %>%
  group_by(Transect, Elevation) %>%
  summarise(n_samples = length(EHI_number)) %>%
  tt()
tinytable_2swrkdzgbzgc91tr6dr7
Transect Elevation n_samples
Aisa 1250 6
Aisa 1450 6
Aisa 1650 4
Aisa 1850 6
Aran 1000 6
Aran 1080 7
Aran 1340 5
Aran 1500 6
Aran 1650 7
Aran 1850 6
Sentein 941 5
Sentein 1260 4
Sentein 1628 5
Sentein 1873 5
Tourmalet 953 5
Tourmalet 1255 4
Tourmalet 1561 4
Tourmalet 1797 4
Tourmalet 2065 2
Tourmalet 2134 3
Tourmalet 2306 5
#n of analysed faecal samples
ncol(read_counts)
[1] 106

Geographical location of sampled lizards in the Pyrenees.

#Summarise for generating map
options(dplyr.summarise.inform = FALSE)
sample_metadata_summary <- sample_metadata %>%
  #Group by geography and count samples
  select(EHI_number, latitude, longitude, Transect) %>%
  group_by(latitude, longitude, Transect) %>%
  summarize(count = n()) %>%
  ungroup()

#plotting on map
## Determine the longitude and latitude ranges
lon_range <- range(sample_metadata_summary$longitude, na.rm = TRUE)
lat_range <- range(sample_metadata_summary$latitude, na.rm = TRUE)

sample_metadata_summary %>%
  ggplot(.) +
  #render map
  geom_map(
    data=map_data("world"),
    map = map_data("world"),
    aes(long, lat, map_id=region),
    color = "white", fill = "#cccccc", linewidth = 0.2
  ) +
  #render points
  geom_point(
    aes(x=longitude,y=latitude, color=Transect),
    alpha=0.5, shape=16) +
  #add general plot layout
  theme_minimal() +
  theme(legend.position = "right",
        axis.title.x=element_blank(),
        axis.title.y=element_blank()
  ) + coord_map("mercator", xlim = lon_range, ylim = lat_range)

4 Data statistics

4.1 Sequencing reads statistics

sample_metadata %>% 
    summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2), 
              mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
              sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
    unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
    tt()
tinytable_ai2h7b6jpl1qcgilajzz
Total Average
652.04 6.21 ± 2.77

4.2 Sequencing depth

sequencing_depth <- read_counts %>%
  column_to_rownames(var = "genome") %>%
  colSums()

4.3 DNA fractions

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
    left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
    select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
    mutate(mags_bases = mags*146) %>%
    mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
    mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
    mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
    select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)

sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  tt()
tinytable_bm3ev92ygx9pk0i75ibj
Sample Low quality Mapped to host Unmapped Mapped to MAGs
EHI00069 1.1441521 0.920532571 2.0517957 4.1058263
EHI00070 0.2348919 0.937599880 0.8332809 1.6364476
EHI00072 0.3717090 0.078319449 0.9507999 4.0430716
EHI00073 0.1504762 0.007300164 0.6458066 1.8453666
EHI00074 0.5700275 0.034671349 1.7906564 5.7526051
EHI00075 0.2702205 0.444390387 0.9599202 2.4896265
EHI00076 0.3350547 0.081391903 1.4172925 3.8389468
EHI00077 0.1353495 0.641770627 0.3885242 1.3947384
EHI00079 0.3377906 2.721037061 0.5653554 0.8084465
EHI00080 0.7291964 1.605187153 2.3611523 2.6883254
EHI00081 0.2184180 0.727448090 0.5615342 1.3358755
EHI00085 0.2300087 0.212261834 0.5665817 1.9281604
EHI00086 0.3471176 0.417593612 1.0770869 1.7440344
EHI00088 0.2278663 0.071769210 0.5556301 3.0141044
EHI00089 0.1751910 0.034915171 0.5162821 1.9576155
EHI00091 0.2622745 0.230039482 0.8255133 2.4924602
EHI00092 0.3917724 0.170842450 1.0255541 4.9181624
EHI00093 0.3725739 0.199633318 1.1938025 2.3137473
EHI00095 0.1768550 0.255990148 0.3449738 1.9242079
EHI00097 0.5249214 0.366551383 0.8028468 2.9685729
EHI00098 0.3176395 0.040047629 1.7271586 3.1498120
EHI00100 0.4386907 2.398470758 0.8228622 4.1093643
EHI00101 0.7257361 0.451928365 4.9291736 1.9943098
EHI00103 0.2036111 0.843529516 0.5714195 1.3395113
EHI00104 0.1769983 0.643701669 0.6807099 1.2574891
EHI00105 0.2711051 0.005694785 0.9087805 2.7311372
EHI00106 1.1082626 1.058626752 1.8709694 2.0746217
EHI00107 0.3818355 0.184257358 1.5836979 3.0592637
EHI00108 0.3420046 0.394171066 0.5356632 2.7441687
EHI00110 0.3648881 0.591747269 0.5588077 2.6145261
EHI00111 0.4392502 0.151954279 1.1992388 3.5773008
EHI00112 0.7134927 0.085183185 1.1328903 3.6469985
EHI00113 0.8620893 0.617092903 1.4939464 7.3254896
EHI00114 0.4127445 0.087174427 2.3985081 4.6641659
EHI00115 0.6290438 0.062352261 2.6081814 3.0085937
EHI00116 0.4130247 0.039596686 0.8559983 4.7270478
EHI00117 0.6850197 0.423816725 1.2454024 7.0782364
EHI00118 0.3963952 0.784391517 0.9049477 3.7006953
EHI00119 0.4448016 0.038727564 1.5277021 4.6761692
EHI00120 0.3549466 0.051038023 0.7165254 3.8122875
EHI00121 0.2487524 1.196143747 0.9172490 1.9448859
EHI00122 0.3967333 0.056247498 1.7140526 5.3880487
EHI00124 0.3101617 0.113171041 1.6986936 3.2732918
EHI00125 0.3008464 0.313488992 1.2381544 3.1620784
EHI00128 0.3688688 0.149336152 1.0558709 2.9648899
EHI00129 0.2827708 0.212726094 0.7352845 2.7738704
EHI00130 0.2917071 0.082344877 1.0170695 2.1151693
EHI00131 0.1940832 0.943415382 0.4863996 1.2048589
EHI00133 0.2569782 0.429506925 0.3484713 3.3747982
EHI00134 0.5620395 1.988142143 1.9520876 3.9427347
EHI00137 0.2948339 0.250489910 1.0341318 2.4844583
EHI00138 0.2673241 0.179238573 0.5826993 2.8578460
EHI00139 0.3001940 0.034225947 0.9413847 2.1426906
EHI00176 0.7041777 0.100761385 0.7196926 2.7465205
EHI00177 0.2555111 0.016053624 0.5228890 2.1639513
EHI00178 0.2415179 0.045338274 1.3634101 2.6038297
EHI00179 0.1816121 0.023148016 0.9401620 2.3926575
EHI00180 0.5363481 0.037200326 0.7786810 2.4699635
EHI00181 0.1978972 0.301399301 0.4733087 2.4456488
EHI00422 0.1660426 0.058188268 0.4631427 2.3301737
EHI00426 0.6048076 0.374841259 1.6091057 5.6754435
EHI00428 0.4361908 0.354862015 0.9866519 3.7380622
EHI00433 0.6941201 0.730117194 1.6017157 3.7066518
EHI00438 0.2938182 0.100587490 0.8490354 4.7546269
EHI00441 0.5217550 0.202212815 2.3199030 6.8309541
EHI00451 0.4696437 0.318898071 1.2315115 5.0794188
EHI00456 0.5747753 0.097152439 0.6337192 4.7496272
EHI00458 0.5541280 0.326249839 1.7364884 4.9185106
EHI00462 0.5139086 0.330809687 1.3870295 4.8485639
EHI00464 0.5604449 0.962080506 1.5487451 5.0533364
EHI00465 0.1355889 0.041795770 0.7088416 1.4262076
EHI00467 0.4291035 0.854761759 1.6444666 3.7590494
EHI00470 0.6625583 1.679223689 1.8675738 8.1840405
EHI00472 0.3619652 0.586418138 1.8074071 3.3732380
EHI00473 0.4841385 0.477433785 1.2481742 4.8043613
EHI00477 0.4782444 0.087653441 2.7186490 3.7317223
EHI00479 0.8395454 0.074374376 1.9936025 12.8156007
EHI00480 0.7475415 0.793212202 3.1481051 4.3553735
EHI00481 0.5040521 0.094726409 1.3739735 5.3792865
EHI00483 0.4545170 1.532171008 0.8653232 4.7402692
EHI00484 0.3037426 0.269250378 1.3870200 5.2563238
EHI00488 0.4900356 2.212642337 2.0850126 4.3632065
EHI00490 0.6234248 0.180662818 3.5993286 7.8297890
EHI00493 0.4026458 2.030328521 2.7098633 2.4780854
EHI00496 0.4536382 0.415085470 1.7045368 6.1699650
EHI00499 3.5142773 2.204637514 3.4291586 3.2083353
EHI00504 0.5335873 2.167943278 1.2709452 4.9663386
EHI00506 0.7953759 0.639271744 1.7418071 8.0883653
EHI00507 0.4077748 0.136492692 1.5474048 5.5083429
EHI00512 0.6299478 8.067540048 0.5964127 1.3433168
EHI00514 0.4590217 0.336676348 1.5023882 6.2571308
EHI00518 0.4082296 0.012341521 1.0122951 5.8379043
EHI00524 0.4918121 1.320091255 2.0577845 4.8996823
EHI00525 0.3238390 0.105964164 1.1420157 5.0531429
EHI00527 0.5623443 0.128178226 2.3107744 7.6106934
EHI00529 0.3350620 0.129930230 1.3102065 5.1130080
EHI00536 0.4725239 0.010876654 1.2534820 7.7176039
EHI00537 0.3665586 0.089579032 2.4631156 1.4492223
EHI00538 0.4501201 2.638414328 0.6397030 3.1637601
EHI00541 0.7086242 0.152318595 1.4153324 6.8350770
EHI00547 1.0225445 2.648359289 7.0526142 6.1760904
EHI00566 0.5706734 0.218664855 1.7237625 7.3779017
EHI00567 0.4565067 0.133011864 1.9051503 4.4466414
EHI00568 0.5246686 0.219110833 1.7989251 3.5919103
EHI00569 0.6859077 0.169999886 2.6761990 6.7957645
sequence_fractions %>%
    pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
    mutate(value = value / 1000000000) %>%
    mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
    ggplot(., aes(x = sample, y = value, fill=fraction)) +
        geom_bar(position="stack", stat = "identity") +
      scale_fill_manual(name="Sequence type",
                    breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
                    labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                    values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
        labs(x = "Samples", y = "Amount of data (GB)") +
        theme_classic() +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

4.4 Recovered microbial fraction

singlem_table <- sequence_fractions %>%
    mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
    left_join(sample_metadata, by = join_by(sample == EHI_number))  %>%
    mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
    select(sample,mags_proportion,singlem_proportion) %>%
    mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
    mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify

singlem_table %>%
    pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
    left_join(sample_metadata, by = join_by(sample == EHI_number))  %>%
    mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
    ggplot(., aes(x = value, y = sample, color=proportion)) +
            geom_line(aes(group = sample), color = "#f8a538") +
            geom_point() +
      scale_color_manual(name="Proportion",
                    breaks=c("mags_proportion","singlem_proportion"),
                    labels=c("Recovered","Estimated"),
                    values=c("#52e1e8", "#876b53"))+
      facet_nested(species + sample_type ~ ., scales="free",space="free")+
            theme_classic() +
            labs(y = "Samples", x = "Prokaryotic fraction (%)") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),
              legend.position = "right",
              strip.background.y=element_rect(color = NA, fill= "#f4f4f4"))

5 MAG catalogue

load("data/data.Rdata")

5.1 Genome phylogeny

# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(genome,phylum) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome")

# Generate  basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
    ggtree(., layout="fan", open.angle=10, size=0.5)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.55, width=0.1, colnames=FALSE) +
        scale_fill_manual(values=phylum_colors) +
        geom_tiplab2(size=1, hjust=-0.1) +
        theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))

# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()

# Add completeness ring
circular_tree <- circular_tree +
        new_scale_fill() +
        scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
        geom_fruit(
                data=genome_metadata,
                geom=geom_bar,
                mapping = aes(x=completeness, y=genome, fill=contamination),
                offset = 0.55,
                orientation="y",
              stat="identity")

# Add genome-size ring
circular_tree <-  circular_tree +
        new_scale_fill() +
        scale_fill_manual(values = "#cccccc") +
        geom_fruit(
             data=genome_metadata,
             geom=geom_bar,
             mapping = aes(x=length, y=genome),
                 offset = 0.05,
                 orientation="y",
         stat="identity")

# Add text
circular_tree <-  circular_tree +
        annotate('text', x=3.4, y=0, label='            Phylum', family='arial', size=3.5) +
        annotate('text', x=4.3, y=0, label='                         Genome quality', family='arial',size=3.5)+
        annotate('text', x=4.8, y=0, label='                     Genome size', family='arial', size=3.5)

#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)

5.2 Genome quality

genome_metadata %>% 
    summarise(completeness_mean=mean(completeness) %>% round(2) %>% as.character(), 
              completeness_sd=sd(completeness) %>% round(2) %>% as.character(), 
              contamination_mean=mean(contamination) %>% round(2), 
              contamination_sd=sd(contamination) %>% round(2)) %>%
    unite("Completeness",completeness_mean, completeness_sd, sep = " ± ", remove = TRUE) %>%
    unite("Contamination",contamination_mean, contamination_sd, sep = " ± ", remove = TRUE) %>%
    tt()
tinytable_egtl4wkwaccuz9p9oas9
Completeness Contamination
79.99 ± 15.48 2.19 ± 1.94
#Generate quality biplot
genome_biplot <- genome_metadata %>%
  select(c(genome,domain,phylum,completeness,contamination,length)) %>%
  arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
  ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
              geom_point(alpha=0.7) +
                    ylim(c(10,0)) +
                    scale_color_manual(values=phylum_colors) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "none")

#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(10,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_void() +
                    theme(legend.position = "none",
                        axis.title.x = element_blank(),
                        axis.title.y = element_blank(),
                        axis.text.y=element_blank(),
                        axis.ticks.y=element_blank(),
                        axis.text.x=element_blank(),
                        axis.ticks.x=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
        ggplot(aes(x=completeness)) +
                xlim(c(50,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_void() +
                theme(legend.position = "none",
                    axis.title.x = element_blank(),
                    axis.title.y = element_blank(),
                    axis.text.y=element_blank(),
                    axis.ticks.y=element_blank(),
                    axis.text.x=element_blank(),
                    axis.ticks.x=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))

5.3 Functional overview

# Aggregate basal GIFT into elements
function_table <- genome_gifts %>%
    to.elements(., GIFT_db)

# Generate  basal tree
function_tree <- force.ultrametric(genome_tree, method="extend") %>%
                ggtree(., size = 0.3) 
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
            scale_fill_manual(values=phylum_colors) +
            labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
            vexpand(.08) +
            coord_cartesian(clip = "off") +
            scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
            labs(fill="GIFT")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

# Add completeness barplots
function_tree <- function_tree +
            geom_fruit(data=genome_metadata,
            geom=geom_bar,
            grid.params=list(axis="x", text.size=2, nbreak = 1),
            axis.params=list(vline=TRUE),
            mapping = aes(x=length, y=genome, fill=completeness),
                 offset = 3.8,
                 orientation="y",
                 stat="identity") +
            scale_fill_gradient(low = "#cf8888", high = "#a2cc87") +
            labs(fill="Genome\ncompleteness")

function_tree

5.4 Functional ordination

# Generate the tSNE ordination
tSNE_function <- Rtsne(X=function_table, dims = 2, check_duplicates = FALSE)

# Plot the ordination
function_ordination <- tSNE_function$Y %>%
                as.data.frame() %>%
                mutate(genome=rownames(function_table)) %>%
                inner_join(genome_metadata, by="genome") %>%
                rename(tSNE1="V1", tSNE2="V2") %>%
                select(genome,phylum,tSNE1,tSNE2, length) %>%
                ggplot(aes(x = tSNE1, y = tSNE2, color = phylum, size=length))+
                            geom_point(shape=16, alpha=0.7) +
                            scale_color_manual(values=phylum_colors) +
                            theme_minimal() +
                labs(color="Phylum", size="Genome size") +
                guides(color = guide_legend(override.aes = list(size = 5))) # enlarge Phylum dots in legend

function_ordination

5.5 MAGs shared across transects

genome_counts_rel <- genome_counts_filt_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>%
  column_to_rownames(., "genome")
genome_counts_rel_pa=1*(genome_counts_rel>0)

table_upset_analysis_cont=t(aggregate(t(genome_counts_rel_pa),by=list(sample_metadata$Transect),FUN=sum)[,-1])
colnames(table_upset_analysis_cont)=levels(as.factor(sample_metadata$Transect))
table_upset_analysis=(table_upset_analysis_cont>0)*1
table_upset_analysis=data.frame(table_upset_analysis)
table_upset_analysis=apply(table_upset_analysis,2,as.integer)
rownames(table_upset_analysis) <- rownames(genome_counts_rel_pa)

locationcolors=c("#e5bd5b", "#6b7398","#e2815a", "#876b96")
upset(as.data.frame(table_upset_analysis),
      keep.order = T,
      sets = rev(c("Aisa","Aran","Sentein","Tourmalet")),
      sets.bar.color= rev(locationcolors),
      mb.ratio = c(0.55, 0.45), order.by = "freq")

6 Community composition

load("data/data.Rdata")

6.1 Taxonomy overview

6.1.1 Stacked barplot

genome_counts_filt_met<-genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  left_join(., sample_metadata, by = join_by(sample == EHI_number)) %>% #append sample metadata
  filter(count > 0) #filter 0 counts

genome_counts_filt_met$Elevation<-as.factor(genome_counts_filt_met$Elevation)
# Create an interaction variable for elevation and sample
genome_counts_filt_met$interaction_var <- interaction(genome_counts_filt_met$sample, genome_counts_filt_met$Elevation)

# Plot stacked barplot
ggplot(genome_counts_filt_met, aes(x=interaction_var,y=count,fill=phylum, group=phylum))+ #grouping enables keeping the same sorting of taxonomic units
    geom_bar(stat="identity", colour="white", linewidth=0.1)+ #plot stacked bars with white borders
    scale_fill_manual(values=phylum_colors) +
    labs(y = "Relative abundance", x="Elevation (m)") +
    guides(fill = guide_legend(ncol = 3)) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
          panel.background = element_blank(),
          panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
          legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size=7))+
    scale_x_discrete(labels = function(x) gsub(".*\\.", "", x)) +
    facet_wrap(~Transect, scales = "free", labeller = as_labeller(function(label) gsub(".*\\.", "", label))) #only show elevation label

6.1.2 Phylum relative abundances

phylum_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
  left_join(genome_metadata, by = join_by(genome == genome)) %>%
  group_by(sample,phylum) %>%
  summarise(relabun=sum(count))

phylum_summary %>%
    group_by(phylum) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T))  %>%
    mutate(total=str_c(round(total_mean,2),"±",round(total_sd,2))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(phylum,total) %>% 
    tt()
tinytable_bbb107twaop4adasyz2f
phylum total
p__Bacillota_A 40.6±15.93
p__Bacteroidota 38.3±15.88
p__Pseudomonadota 5.83±7.87
p__Bacillota 4.59±6.37
p__Campylobacterota 3.47±5.63
p__Desulfobacterota 2.68±4.05
p__Verrucomicrobiota 1.53±2.43
p__Bacillota_C 0.95±1.05
p__Fusobacteriota 0.91±3.7
p__Cyanobacteriota 0.4±0.6
p__Spirochaetota 0.22±0.97
p__Bacillota_B 0.19±0.32
p__Actinomycetota 0.17±0.52
p__Chlamydiota 0.1±0.61
p__Deferribacterota 0.05±0.27
p__Synergistota 0.01±0.08
phylum_arrange <- phylum_summary %>%
    group_by(phylum) %>%
    summarise(mean=mean(relabun)) %>%
    arrange(-mean) %>%
    select(phylum) %>%
    pull()

phylum_summary %>%
    filter(phylum %in% phylum_arrange) %>%
    mutate(phylum=factor(phylum,levels=rev(phylum_arrange))) %>%
    ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
        scale_color_manual(values=phylum_colors[rev(phylum_arrange)]) +
        geom_jitter(alpha=0.5) + 
        theme_minimal()

6.2 Taxonomy boxplot

6.2.1 Family

family_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>% #append sample metadata
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  group_by(sample,family) %>%
  summarise(relabun=sum(count))

family_summary %>%
    group_by(family) %>%
    summarise(mean=mean(relabun),sd=sd(relabun)) %>%
    arrange(-mean) %>%
    tt()
tinytable_v31qg72jd14vx03dqgu7
family mean sd
f__Lachnospiraceae 2.798160e-01 0.1323151296
f__Bacteroidaceae 2.132576e-01 0.1045524501
f__Tannerellaceae 1.122881e-01 0.0756060991
f__ 6.556348e-02 0.0820773598
f__Helicobacteraceae 3.474580e-02 0.0562678198
f__Marinifilaceae 3.441228e-02 0.0255200299
f__UBA3700 2.777694e-02 0.0330290932
f__Desulfovibrionaceae 2.680077e-02 0.0404627923
f__Ruminococcaceae 2.435712e-02 0.0226880677
f__Rikenellaceae 1.900710e-02 0.0176572117
f__Erysipelotrichaceae 1.767814e-02 0.0269603016
f__Oscillospiraceae 1.696763e-02 0.0140960977
f__Coprobacillaceae 1.273032e-02 0.0336927310
f__Mycoplasmoidaceae 1.210259e-02 0.0270262454
f__Enterobacteriaceae 1.077111e-02 0.0651262303
f__Fusobacteriaceae 9.053249e-03 0.0370456782
f__Akkermansiaceae 8.677367e-03 0.0108836065
f__CAG-239 7.003283e-03 0.0098019806
f__LL51 6.656864e-03 0.0216352391
f__Anaerotignaceae 6.551732e-03 0.0074061779
f__UBA3830 5.185462e-03 0.0171469663
f__Gastranaerophilaceae 4.003188e-03 0.0059975426
f__Muribaculaceae 3.991492e-03 0.0409006227
f__Butyricicoccaceae 3.919222e-03 0.0050520985
f__CAG-274 3.051617e-03 0.0060828463
f__Acutalibacteraceae 2.723615e-03 0.0047471119
f__Anaerovoracaceae 2.683446e-03 0.0040557355
f__Pumilibacteraceae 2.544525e-03 0.0041701010
f__UBA1997 2.383785e-03 0.0080270055
f__CAG-508 2.324078e-03 0.0063109845
f__Brevinemataceae 2.191263e-03 0.0097068603
f__Peptococcaceae 1.937490e-03 0.0031961496
f__Rhodocyclaceae 1.936615e-03 0.0194735479
f__DTU072 1.792884e-03 0.0055918788
f__UBA660 1.742458e-03 0.0054664441
f__MGBC116941 1.714356e-03 0.0090980913
f__Massilibacillaceae 1.502892e-03 0.0024528334
f__Eggerthellaceae 1.377525e-03 0.0037174906
f__Enterococcaceae 8.454990e-04 0.0070848098
f__CALTSX01 5.198677e-04 0.0053270584
f__Chlamydiaceae 5.170637e-04 0.0030492819
f__Mucispirillaceae 4.593259e-04 0.0026630822
f__CALVMC01 4.496143e-04 0.0018583730
f__Clostridiaceae 4.212744e-04 0.0029132915
f__Acidaminococcaceae 4.004438e-04 0.0015777220
f__UBA1242 3.719350e-04 0.0014704441
f__RUG11792 3.717143e-04 0.0019425248
f__CAG-465 3.497207e-04 0.0015695529
f__Microbacteriaceae 3.199134e-04 0.0026999464
f__CAG-288 2.985206e-04 0.0017514912
f__Streptococcaceae 2.479473e-04 0.0018438849
f__Anaplasmataceae 2.435113e-04 0.0021508694
f__Hepatoplasmataceae 2.362221e-04 0.0024205565
f__Aeromonadaceae 2.327082e-04 0.0022486997
f__Peptostreptococcaceae 1.412306e-04 0.0014471826
f__Rhodobacteraceae 1.371859e-04 0.0014057373
f__Xanthomonadaceae 9.245878e-05 0.0009474206
f__Synergistaceae 7.840493e-05 0.0008034115
f__Lactobacillaceae 4.203166e-05 0.0004306963
f__Turicibacteraceae 0.000000e+00 0.0000000000
family_arrange <- family_summary %>%
    group_by(family) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(family) %>%
    pull()

family_summary %>%
    left_join(genome_metadata %>% select(family,phylum) %>% unique(),by=join_by(family==family)) %>%
    left_join(sample_metadata,by=join_by(sample==EHI_number)) %>%
    filter(family %in% family_arrange[1:20]) %>%
    mutate(family=factor(family,levels=rev(family_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
        scale_color_manual(values=phylum_colors[-8]) +
        geom_jitter(alpha=0.5) + 
        theme_minimal() + 
        labs(y="Family", x="Relative abundance", color="Phylum")

6.2.2 Genus

genus_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>% #append sample metadata
  left_join(genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  group_by(sample,genus) %>%
  summarise(relabun=sum(count)) %>%
  filter(genus != "g__")

genus_summary %>%
    group_by(genus) %>%
    summarise(mean=mean(relabun),sd=sd(relabun)) %>%
    arrange(-mean) %>%
    tt()
tinytable_n53rysac6a15ir0hhv4i
genus mean sd
g__Bacteroides 1.725925e-01 0.0994470682
g__Parabacteroides 1.103381e-01 0.0767310570
g__Roseburia 5.077999e-02 0.0743624266
g__Phocaeicola 3.874581e-02 0.0435882803
g__JAAYNV01 3.526845e-02 0.0735025649
g__Odoribacter 3.376557e-02 0.0254845145
g__Helicobacter_J 2.983823e-02 0.0400574428
g__CAG-95 1.659217e-02 0.0240471187
g__Alistipes 1.555908e-02 0.0156740595
g__Kineothrix 1.544351e-02 0.0584815409
g__MGBC136627 1.451743e-02 0.0230914933
g__Mycoplasmoides 1.139564e-02 0.0269054026
g__Hungatella_A 1.113992e-02 0.0670243947
g__Anaerotruncus 1.006956e-02 0.0112599653
g__Velocimicrobium 9.687390e-03 0.0159382742
g__Enterocloster 9.506772e-03 0.0098451213
g__Fusobacterium_A 9.053249e-03 0.0370456782
g__Acetatifactor 8.989245e-03 0.0139417426
g__Akkermansia 8.677367e-03 0.0108836065
g__Clostridium_Q 7.770136e-03 0.0138182274
g__Bilophila 7.347777e-03 0.0088824113
g__Lawsonia 7.130440e-03 0.0356135290
g__Intestinimonas 6.565049e-03 0.0061282758
g__Lacrimispora 6.397950e-03 0.0078443852
g__Lachnotalea 5.679000e-03 0.0086820363
g__Desulfovibrio 5.604601e-03 0.0085133819
g__MGBC140009 5.533083e-03 0.0134005650
g__Extibacter 5.301402e-03 0.0438086011
g__Coprobacillus 5.255707e-03 0.0160178991
g__Ventrimonas 5.038137e-03 0.0083320432
g__NHYM01 4.907573e-03 0.0427691471
g__Dielma 4.863951e-03 0.0065480876
g__Eisenbergiella 4.698317e-03 0.0069699563
g__CHH4-2 4.463440e-03 0.0044701587
g__RGIG4733 4.252340e-03 0.0103379868
g__Negativibacillus 4.038946e-03 0.0054742770
g__Thomasclavelia 3.858196e-03 0.0117540652
g__Hungatella 3.440712e-03 0.0046402483
g__C-19 3.334625e-03 0.0097460323
g__Citrobacter 3.199667e-03 0.0207814167
g__UMGS1251 2.883830e-03 0.0066485409
g__Oscillibacter 2.697945e-03 0.0038453712
g__CAZU01 2.687821e-03 0.0062923086
g__Breznakia 2.569391e-03 0.0076690515
g__Copromonas 2.559264e-03 0.0037548588
g__Mailhella 2.486840e-03 0.0034349591
g__Pseudoflavonifractor 2.349829e-03 0.0028250595
g__Intestinibacillus 2.290036e-03 0.0027696754
g__Escherichia 2.232632e-03 0.0160096591
g__Brevinema 2.191263e-03 0.0097068603
g__MGBC165282 2.146060e-03 0.0051869977
g__Rikenella 2.123615e-03 0.0033608364
g__Morganella 2.019952e-03 0.0206983454
g__Robinsoniella 2.000942e-03 0.0198167695
g__Parabacteroides_B 1.950084e-03 0.0064473560
g__Hafnia 1.939718e-03 0.0112340944
g__Fluviibacter 1.936615e-03 0.0194735479
g__JAIHAL01 1.909337e-03 0.0043937983
g__CAJLXD01 1.809579e-03 0.0041385093
g__Marseille-P3106 1.729807e-03 0.0025326958
g__UBA866 1.535703e-03 0.0026149730
g__MGBC116941 1.433278e-03 0.0090996222
g__Duncaniella 1.426611e-03 0.0146184139
g__Stoquefichus 1.418761e-03 0.0046100976
g__Limenecus 1.393591e-03 0.0029887106
g__JAAYQI01 1.271605e-03 0.0020016472
g__RGIG6463 1.268390e-03 0.0028982477
g__Lawsonibacter 1.177407e-03 0.0016547788
g__Scatousia 1.174334e-03 0.0033329854
g__MGBC101980 1.147280e-03 0.0043856608
g__Hespellia 1.102861e-03 0.0080033094
g__Clostridium_AQ 1.075206e-03 0.0036972777
g__Tidjanibacter 1.062223e-03 0.0029392145
g__Fournierella 1.021882e-03 0.0022586979
g__Eggerthella 9.946416e-04 0.0031801193
g__OM05-12 9.566920e-04 0.0022952664
g__CALXRO01 9.555497e-04 0.0060396607
g__CALURL01 9.088777e-04 0.0021653921
g__Harryflintia 8.906013e-04 0.0020436518
g__MGBC133411 8.872215e-04 0.0021982622
g__Scatacola_A 8.685187e-04 0.0028129788
g__JALFVM01 8.392923e-04 0.0020111922
g__Bacteroides_G 8.358400e-04 0.0024999742
g__Ventrisoma 8.334289e-04 0.0015664780
g__CAG-269 8.185801e-04 0.0029513181
g__IOR16 8.132853e-04 0.0024147185
g__CAG-873 7.794020e-04 0.0079864937
g__Buttiauxella 7.562352e-04 0.0062186311
g__14-2 7.242943e-04 0.0014229865
g__Ureaplasma 7.069544e-04 0.0022811697
g__Scatocola 6.903112e-04 0.0024144468
g__Dysosmobacter 6.880800e-04 0.0012817741
g__Muricomes 6.843281e-04 0.0023723812
g__Anaerovorax 6.779999e-04 0.0017521412
g__UBA7185 6.690992e-04 0.0018509746
g__Butyricimonas 6.467068e-04 0.0016133034
g__MGBC131033 6.443020e-04 0.0015998136
g__Evtepia 6.309865e-04 0.0010453519
g__CAJMNU01 6.281598e-04 0.0008908563
g__Beduini 5.914594e-04 0.0013148807
g__Muribaculum 5.550053e-04 0.0056871120
g__Scandinavium 5.392506e-04 0.0034848792
g__Lactonifactor 5.340135e-04 0.0013169750
g__CALTSX01 5.198677e-04 0.0053270584
g__CAG-485 5.128011e-04 0.0052546481
g__Merdicola 5.109908e-04 0.0018183944
g__Ventrenecus 4.954074e-04 0.0024443063
g__UMGS1202 4.885966e-04 0.0016971062
g__Copranaerobaculum 4.827706e-04 0.0029167211
g__NSJ-61 4.574747e-04 0.0014044413
g__Faecimonas 4.467204e-04 0.0017621834
g__RGIG8482 4.415175e-04 0.0020991286
g__Faecivivens 4.313538e-04 0.0008927244
g__RGIG9287 4.228193e-04 0.0021002237
g__Sarcina 4.212744e-04 0.0029132915
g__Blautia_A 4.144742e-04 0.0010034450
g__Scatenecus 4.103021e-04 0.0026534948
g__Phascolarctobacterium 4.004438e-04 0.0015777220
g__Raoultibacter 3.828831e-04 0.0011609168
g__Caccovivens 3.719350e-04 0.0014704441
g__CAJTFG01 3.690163e-04 0.0010223015
g__HGM11386 3.642876e-04 0.0016060882
g__CAG-465 3.497207e-04 0.0015695529
g__Amedibacillus 3.457091e-04 0.0023834921
g__Enterococcus_A 3.276668e-04 0.0023252026
g__UMGS2016 3.212305e-04 0.0012912770
g__Emergencia 3.205926e-04 0.0009702487
g__Holdemania 3.098282e-04 0.0010287045
g__Blautia 3.095400e-04 0.0011077317
g__Protoclostridium 3.067429e-04 0.0010837140
g__Fimivivens 3.043536e-04 0.0008104254
g__RGIG7389 2.985465e-04 0.0005848732
g__CAG-345 2.985206e-04 0.0017514912
g__UBA7173 2.963722e-04 0.0030369115
g__Bariatricus 2.928468e-04 0.0008379528
g__Agathobaculum 2.787287e-04 0.0016389968
g__CALXDZ01 2.660973e-04 0.0006132220
g__UBA940 2.621830e-04 0.0009075589
g__Microbacterium 2.563077e-04 0.0026263723
g__Aminipila 2.506050e-04 0.0007479203
g__Lactococcus 2.479473e-04 0.0018438849
g__Wolbachia 2.435113e-04 0.0021508694
g__Paramuribaculum 2.432436e-04 0.0024925056
g__Hepatoplasma 2.362221e-04 0.0024205565
g__Aeromonas 2.327082e-04 0.0022486997
g__WRHT01 2.186196e-04 0.0006955370
g__Zhenpiania 2.116559e-04 0.0012708329
g__UBA5026 2.112884e-04 0.0009778887
g__UMGS1663 1.999591e-04 0.0007286034
g__MGBC107952 1.752651e-04 0.0009887232
g__CALXEL01 1.725652e-04 0.0013728689
g__CAG-273 1.482564e-04 0.0007864691
g__Clostridioides 1.412306e-04 0.0014471826
g__Paracoccus 1.371859e-04 0.0014057373
g__JAAWBF01 1.286984e-04 0.0006144643
g__JAFLTL01 1.270703e-04 0.0013020827
g__Bacteroides_H 1.267213e-04 0.0012985068
g__RUG12867 9.254620e-05 0.0006129332
g__Stenotrophomonas 9.245878e-05 0.0009474206
g__Rahnella 8.365618e-05 0.0008059384
g__Lumbricidophila 6.360575e-05 0.0006517650
g__UBA3263 5.098641e-05 0.0005224553
g__Fructobacillus 4.203166e-05 0.0004306963
g__Clostridium 0.000000e+00 0.0000000000
g__Turicibacter 0.000000e+00 0.0000000000
genus_arrange <- genus_summary %>%
    group_by(genus) %>%
    summarise(mean=sum(relabun)) %>%
    filter(genus != "g__")%>%
    arrange(-mean) %>%
    select(genus) %>%
    mutate(genus= sub("^g__", "", genus)) %>%
    pull()

genus_summary %>%
    left_join(genome_metadata %>% select(genus,phylum) %>% unique(),by=join_by(genus==genus)) %>%
    left_join(sample_metadata,by=join_by(sample==EHI_number)) %>%
    mutate(genus= sub("^g__", "", genus)) %>%
    filter(genus %in% genus_arrange[1:20]) %>%
    mutate(genus=factor(genus,levels=rev(genus_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
        scale_color_manual(values=phylum_colors[-c(3,4,6,8)]) +
        geom_jitter(alpha=0.5) + 
        theme_minimal() + 
        labs(y="Family", x="Relative abundance", color="Phylum")

7 Diversity analyses

load("data/data.Rdata")

7.1 Alpha diversity

# Calculate Hill numbers
richness <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where(~ !all(. == 0))) %>%
  hilldiv(., q = 0) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(richness = 1) %>%
  rownames_to_column(var = "sample")

neutral <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where(~ !all(. == 0))) %>%
  hilldiv(., q = 1) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(neutral = 1) %>%
  rownames_to_column(var = "sample")

phylogenetic <- genome_counts_filt %>%
  column_to_rownames(var = "genome") %>%
  dplyr::select(where(~ !all(. == 0))) %>%
  hilldiv(., q = 1, tree = genome_tree) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(phylogenetic = 1) %>%
  rownames_to_column(var = "sample")

# Aggregate basal GIFT into elements
#Get list of present MAGs
present_MAGs <- genome_counts_filt %>%
    column_to_rownames(var = "genome") %>%
        filter(rowSums(.[, -1]) != 0) %>%
        rownames()

#Align KEGG annotations with present MAGs and remove all-zero and all-one traits
present_MAGs <- present_MAGs[present_MAGs %in% rownames(genome_gifts)]
genome_gifts_filt <- genome_gifts[present_MAGs,] %>%
            select_if(~!all(. == 0)) %>%  #remove all-zero modules
            select_if(~!all(. == 1)) #remove all-one modules

#Filter count table to only contain present MAGs after KEGG filtering
genome_counts_filt_filt <- genome_counts_filt %>%
  column_to_rownames(var = "genome")

genome_counts_filt_filt <- genome_counts_filt_filt[present_MAGs,]


dist <- genome_gifts_filt %>%
  to.elements(., GIFT_db) %>%
  traits2dist(., method = "gower")

functional <- genome_counts_filt_filt %>%
  #column_to_rownames(var = "genome") %>%
  #dplyr::select(where(~ !all(. == 0))) %>%
  hilldiv(., q = 1, dist = dist) %>%
  t() %>%
  as.data.frame() %>%
  dplyr::rename(functional = 1) %>%
  rownames_to_column(var = "sample") %>%
  mutate(functional = if_else(is.nan(functional), 1, functional))

# Merge all metrics
alpha_div <- richness %>%
  full_join(neutral, by = join_by(sample == sample)) %>%
  full_join(phylogenetic, by = join_by(sample == sample)) %>%
  full_join(functional, by = join_by(sample == sample))
alpha_div %>%
  pivot_longer(-sample, names_to = "alpha", values_to = "value") %>%
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
    group_by(alpha)%>%
    summarise(
              Aisa_mean=mean(value[Transect=="Aisa"], na.rm=T),
              Aisa_sd=sd(value[Transect=="Aisa"], na.rm=T),
              Aran_mean=mean(value[Transect=="Aran"], na.rm=T),
              Aran_sd=sd(value[Transect=="Aran"], na.rm=T),
              Sentein_mean=mean(value[Transect=="Sentein"], na.rm=T),
              Sentein_sd=sd(value[Transect=="Sentein"], na.rm=T),
              Tourmalet_mean=mean(value[Transect=="Tourmalet"], na.rm=T),
              Tourmalet_sd=sd(value[Transect=="Tourmalet"], na.rm=T))%>%
    mutate(
           Aisa=str_c(round(Aisa_mean,2),"±",round(Aisa_sd,2)),
           Aran=str_c(round(Aran_mean,2),"±",round(Aran_sd,2)),
           Sentein=str_c(round(Sentein_mean,2),"±",round(Sentein_sd,2)),
           Tourmalet=str_c(round(Tourmalet_mean,2),"±",round(Tourmalet_sd,2))) %>% 
    arrange(-Aisa_mean) %>% 
    dplyr::select(alpha,Aisa,Aran,Sentein,Tourmalet) %>% 
    tt()
tinytable_1o0sw5xv50wpmb5bkpl7
alpha Aisa Aran Sentein Tourmalet
richness 198.77±65.73 153.32±72.26 257.32±71.08 245.04±68.73
neutral 87.81±36.55 71.13±36.69 110.98±45.84 101.5±43.7
phylogenetic 4.98±1.13 4.81±1.12 5.41±0.86 5.29±0.9
functional 1.5±0.05 1.5±0.06 1.52±0.03 1.5±0.04
alpha_div %>%
  pivot_longer(-sample, names_to = "metric", values_to = "value") %>%
  left_join(., sample_metadata, by = join_by(sample == EHI_number)) %>%
  mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic","functional"))) %>%
      ggplot(aes(y = value, x = Transect, group=Transect, color=Transect, fill=Transect)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(alpha=0.5) +
      scale_color_manual(name="Transect",
          breaks=c("Aisa","Aran","Sentein","Tourmalet"),
          labels=c("Aisa","Aran","Sentein","Tourmalet"),
          values=c("#e5bd5b", "#6b7398","#e2815a", "#876b96")) +
      scale_fill_manual(name="Transect",
          breaks=c("Aisa","Aran","Sentein","Tourmalet"),
          labels=c("Aisa","Aran","Sentein","Tourmalet"),
          values=c("#e5bd5b50", "#6b739850","#e2815a50", "#876b9650")) +
      facet_wrap(. ~ metric, scales = "free", ncol=4) +
      coord_cartesian(xlim = c(1, NA)) +
      stat_compare_means(size=2) +
      theme_classic() +
      theme(
        strip.background = element_blank(),
        panel.grid.minor.x = element_line(size = .1, color = "grey"),
        axis.text.x = element_text(angle = 45, hjust = 1)
      ) +
  ylab("Alpha diversity")      

7.1.1 Regression plots

7.1.1.1 Richness diversity

columns <- c("richness","neutral","phylo","func","mapped","total")
alpha_div %>%
            select(sample,richness) %>%
            pivot_longer(-sample, names_to = "data", values_to = "value") %>%
            mutate(data = factor(data, levels = columns))   %>%
            left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
      ggplot(aes(x = Elevation, y = value)) +
        geom_point() +
        geom_smooth(method = lm) +
        facet_wrap(~ factor(Transect))+
        labs(x = "Elevation (m)")

7.1.1.2 Neutral diversity

alpha_div %>%
            select(sample,neutral) %>%
            pivot_longer(-sample, names_to = "data", values_to = "value") %>%
            mutate(data = factor(data, levels = columns))   %>%
            left_join(sample_metadata, by = join_by(sample == EHI_number))  %>%
      ggplot(aes(x = Elevation, y = value)) +
        geom_point() +
        geom_smooth(method = lm) +
        facet_wrap(~ factor(Transect))+
        labs(x = "Elevation (m)")

7.1.1.3 Phylogenetic diversity

alpha_div %>%
            select(sample,phylogenetic) %>%
            pivot_longer(-sample, names_to = "data", values_to = "value") %>%
            mutate(data = factor(data, levels = columns))   %>%
            left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
      ggplot(aes(x = Elevation, y = value)) +
        geom_point() +
        geom_smooth(method = lm) +
        facet_wrap(~ factor(Transect))+
        labs(x = "Elevation (m)")

7.1.1.4 Functional diversities

alpha_div %>%
            select(sample,functional) %>%
            pivot_longer(-sample, names_to = "data", values_to = "value") %>%
            mutate(data = factor(data, levels = columns))   %>%
            left_join(sample_metadata, by = join_by(sample == EHI_number))  %>%
      ggplot(aes(x = Elevation, y = value)) +
        geom_point() +
        geom_smooth(method = lm) +
        facet_wrap(~ factor(Transect))+
        labs(x = "Elevation (m)")

7.1.2 Mixed models

7.1.2.1 Richness diversity

richness_cor<-alpha_div %>%
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
  lmerTest::lmer(richness ~ log(sequencing_depth) + Elevation*Transect + (1 | Sampling_point), data = ., REML = FALSE) %>%
  broom.mixed::tidy() %>%
  tt()
tinytable_tkljg9ny5ibr1kvwpvw1
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) -1.556876e+03 213.86817441 -7.279608 102.67218 6.967323e-11
fixed NA log(sequencing_depth) 1.089104e+02 11.22012642 9.706700 101.19457 3.904741e-16
fixed NA Elevation -5.945361e-02 0.05333190 -1.114785 19.97815 2.781739e-01
fixed NA TransectAran -1.456675e+02 95.32890330 -1.528052 19.77139 1.423407e-01
fixed NA TransectSentein -1.282416e+02 98.14650386 -1.306634 20.96734 2.054881e-01
fixed NA TransectTourmalet -1.937802e+02 91.46019166 -2.118739 20.50091 4.650671e-02
fixed NA Elevation:TransectAran 9.264522e-02 0.06209640 1.491958 19.59034 1.516391e-01
fixed NA Elevation:TransectSentein 1.120611e-01 0.06399044 1.751216 21.21597 9.435497e-02
fixed NA Elevation:TransectTourmalet 1.204621e-01 0.05764375 2.089769 20.55707 4.926134e-02
ran_pars Sampling_point sd__(Intercept) 1.411515e+01 NA NA NA NA
ran_pars Residual sd__Observation 4.636663e+01 NA NA NA NA

7.1.2.2 Neutral diversity

neutral_cor<-alpha_div %>%
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
  lmerTest::lmer(neutral ~ log(sequencing_depth) + Elevation*Transect + (1|Sampling_point), data = ., REML = FALSE) %>%
  broom.mixed::tidy() %>%
  tt()
tinytable_sy9ww8c9uktz6txy1cpe
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) -503.46092882 155.88510189 -3.229692 102.94139 1.663714e-03
fixed NA log(sequencing_depth) 38.53316355 8.30102716 4.641975 102.38427 1.026312e-05
fixed NA Elevation -0.04029799 0.03545720 -1.136525 17.82419 2.707869e-01
fixed NA TransectAran -69.54734931 63.33417863 -1.098101 17.59583 2.869598e-01
fixed NA TransectSentein -73.46205358 65.46915017 -1.122087 19.00703 2.758013e-01
fixed NA TransectTourmalet -97.07019202 60.91625317 -1.593502 18.45946 1.280264e-01
fixed NA Elevation:TransectAran 0.04308459 0.04122860 1.045017 17.37945 3.103332e-01
fixed NA Elevation:TransectSentein 0.05934449 0.04271777 1.389222 19.28043 1.806027e-01
fixed NA Elevation:TransectTourmalet 0.05957175 0.03839934 1.551374 18.50873 1.377430e-01
ran_pars Sampling_point sd__(Intercept) 6.64785214 NA NA NA NA
ran_pars Residual sd__Observation 34.74128890 NA NA NA NA

7.1.2.3 Phylogenetic diversity

phylo_cor<-alpha_div %>%
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
  lmerTest::lmer(phylogenetic ~ log(sequencing_depth) + Elevation*Transect + (1 | Sampling_point), data = ., REML = FALSE) %>%
  broom.mixed::tidy() %>%
  tt()
tinytable_4b7ilkky3tko5w6llzkg
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 7.9723798522 4.1857148123 1.9046639 103.16183 0.05960912
fixed NA log(sequencing_depth) -0.0638680767 0.2249894059 -0.2838715 103.46820 0.77707614
fixed NA Elevation -0.0012366705 0.0008881157 -1.3924655 18.17137 0.18058793
fixed NA TransectAran -0.7649418524 1.5853816744 -0.4824970 17.90442 0.63529916
fixed NA TransectSentein -1.7363345713 1.6449704378 -1.0555415 19.63947 0.30399034
fixed NA TransectTourmalet -3.2228577094 1.5284405960 -2.1085921 18.97101 0.04850288
fixed NA Elevation:TransectAran 0.0002832543 0.0010313958 0.2746320 17.63476 0.78679044
fixed NA Elevation:TransectSentein 0.0014224302 0.0010740261 1.3243907 19.95532 0.20034248
fixed NA Elevation:TransectTourmalet 0.0022220955 0.0009635780 2.3060877 19.01432 0.03253476
ran_pars Sampling_point sd__(Intercept) 0.0488567254 NA NA NA NA
ran_pars Residual sd__Observation 0.9517063007 NA NA NA NA

7.1.2.4 Functional diversities

func_cor<-alpha_div %>%
  left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
  lmerTest::lmer(functional ~ log(sequencing_depth) + Elevation*Transect + (1 | Sampling_point), data = ., REML = FALSE) %>%
  broom.mixed::tidy() %>%
  tt()
tinytable_5r2w7jo9peq6f70voyy0
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 1.604632e+00 2.146120e-01 7.47689687 105 2.400124e-11
fixed NA log(sequencing_depth) -5.072130e-03 1.154428e-02 -0.43936293 105 6.613015e-01
fixed NA Elevation -1.096272e-05 4.526467e-05 -0.24219152 105 8.091042e-01
fixed NA TransectAran 2.877083e-02 8.079764e-02 0.35608504 105 7.224913e-01
fixed NA TransectSentein -5.637786e-03 8.386423e-02 -0.06722516 105 9.465303e-01
fixed NA TransectTourmalet -6.778434e-02 7.791307e-02 -0.86999962 105 3.862852e-01
fixed NA Elevation:TransectAran -2.198663e-05 5.256109e-05 -0.41830623 105 6.765775e-01
fixed NA Elevation:TransectSentein 1.445851e-05 5.475949e-05 0.26403659 105 7.922692e-01
fixed NA Elevation:TransectTourmalet 4.317958e-05 4.911932e-05 0.87907520 105 3.813678e-01
ran_pars Sampling_point sd__(Intercept) 0.000000e+00 NA NA NA NA
ran_pars Residual sd__Observation 4.887921e-02 NA NA NA NA

7.2 Beta diversity

beta_q0n <- genome_counts_filt %>%
  column_to_rownames(., "genome") %>%
  hillpair(., q = 0)

beta_q1n <- genome_counts_filt %>%
  column_to_rownames(., "genome") %>%
  hillpair(., q = 1)

beta_q1p <- genome_counts_filt %>%
  column_to_rownames(., "genome") %>%
  hillpair(., q = 1, tree = genome_tree)

beta_q1f <- genome_counts_filt_filt %>%
  #column_to_rownames(., "genome") %>%
  hillpair(., q = 1, dist = dist)

8 HMSC set-up

8.1 Load data

load("data/data.Rdata")

8.2 Subsetting

# Subset by prevalence (present in more than 5 samples)
selected_genomes1 <- genome_counts %>%
    filter(rowSums(across(starts_with("EHI")) != 0) >= 5) %>%
    select(genome) %>% pull()

# Subset by minimum representation of 1% relative abundance in 5 samples
selected_genomes2 <- genome_counts %>%
    filter(genome %in% selected_genomes1) %>%
    column_to_rownames(var="genome") %>%
    tss() %>%
    as.data.frame() %>%
    filter(rowSums(across(starts_with("EHI")) >= 0.01) >= 5) %>%
    rownames()

# Subset genome metadata
genome_metadata_subset <- genome_metadata %>%
    filter(genome %in% selected_genomes2)
# Random effects data (study design)
StudyDesign <- sample_metadata %>%
                    select(EHI_number,Sampling_point,Transect, Elevation) %>%
                    mutate(Sampling_point = factor(Sampling_point)) %>%
                    mutate(Transect = factor(Transect)) %>%
                    mutate(Elevation = factor(Elevation)) %>%
                    column_to_rownames("EHI_number")

# Genome count table (quantitative community data)
YData <- read_counts  %>%
                    filter(genome %in% selected_genomes2) %>% #subset genomes
                    mutate(across(where(is.numeric), ~ . +1 )) %>% #add +1 pseudocount to remove zeros
                    mutate(across(where(is.numeric), ~ . / (genome_metadata_subset$length / 150) )) %>% #transform to genome counts
                    mutate(across(where(is.numeric), ~  log(.) )) %>% #log-transform
                    column_to_rownames("genome") %>%
                    select(all_of(row.names(StudyDesign))) %>%  #filter only faecal samples
                    as.data.frame() %>%
                    t() # transpose

# Fixed effects data (explanatory variables)
XData <- sample_metadata %>%
                    select(EHI_number,Elevation,Tree_cover, Anthropization_cover) %>%
                    mutate(logseqdepth=read_counts %>% #total log-sequencing depth
                        select(all_of(row.names(StudyDesign))) %>%
                        colSums() %>%
                        log()
                    ) %>%
                    column_to_rownames("EHI_number")


# Genome trait data (not needed now, and it gives error in to.functions section)
#TrData <- genome_gifts %>%
                    #arrange(match(genome, colnames(YData))) %>%
                    #column_to_rownames(var="genome") %>%
                    #to.functions(.,GIFT_db) %>%
                    #as.data.frame()

# Genome phylogeny
PData <- genome_tree

8.3 Define formulas of the Hmsc model

# Fixed effects formula
XFormula1 = ~Elevation + Tree_cover + Anthropization_cover + logseqdepth

# Study design
rL.sampling_point = HmscRandomLevel(units = levels(StudyDesign$Sampling_point))
rL.transect = HmscRandomLevel(units = levels(StudyDesign$Transect))

8.4 Define and Hmsc models

#Define models
model1 = Hmsc(Y=YData,
         XData = XData,
         XFormula = XFormula1,
         studyDesign = StudyDesign,
         phyloTree = PData,
         ranLevels = list("Sampling_point"=rL.sampling_point, "Transect"=rL.transect),
         distr = "normal",
         YScale = TRUE)

#Save list of models as an R object.
model_list = list(model1=model1)
if (!dir.exists("hmsc")){dir.create("hmsc")}
save(model_list, file = "hmsc/hmsc.Rdata")

Upload hmsc/hmsc.Rdata to the HPC respecting the directory structure.

8.5 Define MCMC

# How often to sample the MCMC
MCMC_samples_list = 250

# The number of MCMC steps between each recording sample
MCMC_thin_list = c(1, 10)

# The number of MCMC chains to use
nChains = 4

8.6 Generate Hmsc executables

The next chunk generates shell files for every combination of model, MCMC samples and MCMM thinning, ready to be launched as SLURM jobs.

modelchains <- expand.grid(model = names(model_list), sample = MCMC_samples_list, thin = MCMC_thin_list)

if (!dir.exists("hmsc")){dir.create("hmsc")}
for(i in c(1:nrow(modelchains))){
      modelname=as.character(modelchains[i,1])
      sample=modelchains[i,2]
      thin=modelchains[i,3]
      executablename <- paste0("hmsc/exe_",modelname,"_",sample,"_",thin,".sh")
      fitname <- paste0("hmsc/fit_",modelname,"_",sample,"_",thin,".Rdata")
      convname <- paste0("hmsc/conv_",modelname,"_",sample,"_",thin,".Rdata")
      model <- paste0('model_list$',modelname)
      psrf.beta.name <-  paste0("psrf.beta.",modelname,"_",sample,"_",thin)
      psrf.gamma.name <-  paste0("psrf.gamma.",modelname,"_",sample,"_",thin)
      psrf.rho.name <-  paste0("psrf.rho.",modelname,"_",sample,"_",thin)
      jobname <- paste0("hmsc_",modelname,"_",sample,"_",thin)
      minutes <- round(sample * thin * (ncol(YData)/500), 0)
      code <- sprintf("#!/bin/bash
#SBATCH --job-name=%s                   # Job name
#SBATCH --nodes=1
#SBATCH --ntasks=4                      # Run on 4 CPUs
#SBATCH --mail-user=garazi.bideguren@sund.ku.dk
#SBATCH --mem=800gb                      # Job memory request
#SBATCH --time=%d                       # In minutes

# Activate conda environment
module load mamba/1.3.1
source activate /maps/projects/mjolnir1/people/dlz554/hmsc_env

# Run R script
Rscript -e '
library(tidyverse)
library(Hmsc)
# Load formulas and data
load(\"hmsc/hmsc.Rdata\")

# Declare placeholders
modelname = \"%s\"
model = %s
fitname = \"%s\"
convname = \"%s\"
sample = %d
thin = %d
nchains = %d

# Run model fitting
m = sampleMcmc(hM = model,
         samples = sample,
         thin = thin,
         adaptNf=rep(ceiling(0.4*sample*thin),model$nr),
         transient = ceiling(0.5*sample*thin),
         nChains = nchains,
         nParallel = nchains)

# Assess chain convergence
mpost = convertToCodaObject(m,
      spNamesNumbers = c(T,F),
      covNamesNumbers = c(T,F),
      Beta = TRUE,
      Gamma = TRUE,
      V = FALSE,
      Sigma = FALSE,
      Rho = TRUE,
      Eta = FALSE,
      Lambda = FALSE,
      Alpha = FALSE,
      Omega = FALSE,
      Psi = FALSE,
      Delta = FALSE) # Convert to CODA object

# Fixed effects
assign(paste0(\"psrf.beta.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Beta,multivariate=FALSE)$psrf)

# Traits
assign(paste0(\"psrf.gamma.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Gamma,multivariate=FALSE)$psrf)

# Phylogeny
assign(paste0(\"psrf.rho.\", modelname,\"_\",sample,\"_\",thin), gelman.diag(mpost$Rho,multivariate=FALSE)$psrf)

# Write convergence data
save(%s, %s, %s, file=convname)

# Save model fit object
save(m, file=fitname)
'
", jobname, minutes, modelname, model, fitname, convname, sample, thin, nChains, psrf.beta.name, psrf.gamma.name, psrf.rho.name)
      writeLines(code, executablename)
    }

Upload the produced hmsc/exe_XXXXX.sh files to the HPC respecting the directory structure.

8.7 Fit Hmsc models (in Mjolnir HPC)

Launch the SLURM jobs by using:

# Submit all .sh files in the hmsc folder
for jobfile in hmsc/exe_*.sh; do
    sbatch "$jobfile"
done

#Or launch them one by one only the ones you want to launch
sbatch hmsc/exe_model1_250_1.sh
sbatch hmsc/exe_model1_250_10.sh

8.8 Assess chain convergence

Convergence diagnostic values substantially above 1 indicate lack of convergence. Values below 1.1 are considered good enough

# Load all conv file available in the hmsc folder
file_paths <-list.files(path = "hmsc_bookdown", pattern = "^conv_", full.names = TRUE, include.dirs = TRUE)
for (file_path in file_paths) {
    load(file_path, verbose = TRUE)  # Remove .GlobalEnv argument and specify verbose for each load operation
}
Loading objects:
  psrf.beta.model1_250_1
  psrf.gamma.model1_250_1
  psrf.rho.model1_250_1
Loading objects:
  psrf.beta.model1_250_10
  psrf.gamma.model1_250_10
  psrf.rho.model1_250_10
# Create a merged psrf.beta (genome) plot
ls() %>%
        grep("^psrf\\.beta", ., value = TRUE) %>%
        map_dfr(~ {
         mat <- get(.x)
          data.frame(modelchain = .x, as.data.frame(mat, , stringsAsFactors = FALSE)) %>%
              rownames_to_column(var="parameter") %>%
              mutate(model = str_split(modelchain, "_") %>% map_chr(1) %>% gsub("psrf.beta.","",.)) %>%
              mutate(sample = str_split(modelchain, "_")[[1]][2]) %>% #extract sample info from model name
              mutate(thin = str_split(modelchain, "_")[[1]][3]) #extract thin info from model name
      }) %>%
      ggplot(.,aes(x=reorder(modelchain,-Point.est.,fun=function(x) {quantile(x, probs = 0.9)}),y=Point.est.)) +
        geom_violin(fill="#b8d9e3", color="#328da8") +
        geom_jitter(alpha=0.3,size=0.2, color="#a8babf") +
        stat_summary(fun=function(x) {quantile(x, probs = 0.9)}, geom="crossbar", width=0.2, color="orange") +
        geom_hline(yintercept=1.1, linetype="dashed", color = "red") +
        ylim(0.9,2)+
        labs(x="Model chains",y="Parameter estimates")+
        theme_classic()+
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

9 HMSC analysis

9.1 Load data

load("data/data.Rdata")

9.2 Compute variance partitioning

# Select modelchain of interest
load("hmsc_bookdown/fit_model1_250_10.Rdata")

varpart=computeVariancePartitioning(m)

# Compute variance partitioning
varpart=computeVariancePartitioning(m)

varpart$vals %>%
   as.data.frame() %>%
   rownames_to_column(var="variable") %>%
   pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
   mutate(variable=factor(variable, levels=c("Elevation","Tree_cover", "Anthropization_cover", "logseqdepth","Random: Sampling_point", "Random: Transect"))) %>%
   group_by(variable) %>%
   summarise(mean=mean(value)*100,sd=sd(value)*100) %>%
   tt()
tinytable_tbjsr372hroj2t6a47tg
variable mean sd
Elevation 11.38945 10.849982
Tree_cover 6.97232 4.187276
Anthropization_cover 11.16491 5.066755
logseqdepth 25.34223 13.997659
Random: Sampling_point 27.93211 17.075779
Random: Transect 17.19899 12.998778
# Basal tree
varpart_tree <- genome_tree %>%
        keep.tip(., tip=m$spNames)

#Varpart table
varpart_table <- varpart$vals %>%
   as.data.frame() %>%
   rownames_to_column(var="variable") %>%
   pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
   mutate(variable=factor(variable, levels=rev(c("Elevation","Tree_cover","Anthropization_cover", "logseqdepth","Random: Sampling_point", "Random: Transect")))) %>%
   mutate(genome=factor(genome, levels=rev(varpart_tree$tip.label)))

#Phylums
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% varpart_tree$tip.label) %>%
    arrange(match(genome, varpart_tree$tip.label)) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome") %>%
    select(phylum)


colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% varpart_tree$tip.label) %>%
    arrange(match(genome, varpart_tree$tip.label)) %>%
     select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    select(colors) %>%
    pull()

# Basal ggtree
varpart_tree <- varpart_tree %>%
        force.ultrametric(.,method="extend") %>%
        ggtree(., size = 0.3)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum colors next to the tree tips
varpart_tree <- gheatmap(varpart_tree, phylum_colors, offset=-0.2, width=0.1, colnames=FALSE) +
   scale_fill_manual(values=colors_alphabetic)+
      labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
varpart_tree <- varpart_tree + new_scale_fill()

# Add variance stacked barplot
vertical_tree <-  varpart_tree +
        scale_fill_manual(values=c("#34738f","#cccccc","#ed8a45","#b2b530","#be3e2b","#83bb90","#f6de6c", "#122f3d"))+
        geom_fruit(
             data=varpart_table,
             geom=geom_bar,
             mapping = aes(x=value, y=genome, fill=variable, group=variable),
             pwidth = 2,
             offset = 0.05,
             width= 1,
             orientation="y",
             stat="identity",
             color = "white",
             size=0.1)+
      labs(fill="Variable")

vertical_tree

# Select desired support threshold
support=0.9
negsupport=1-support

# Basal tree
postestimates_tree <- genome_tree %>%
        keep.tip(., tip=m$spNames)

#plotBeta(hM=m, post=getPostEstimate(hM=m, parName="Beta"), param = "Support", plotTree = TRUE, covNamesNumbers=c(1,0))

# Posterior estimate table
post_beta <- getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
    mutate(genome=factor(genome, levels=rev(postestimates_tree$tip.label))) %>%
    mutate(value = case_when(
          value >= support ~ "Positive",
          value <= negsupport ~ "Negative",
          TRUE ~ "Neutral")) %>%
    mutate(value=factor(value, levels=c("Positive","Neutral","Negative"))) %>%
    pivot_wider(names_from = variable, values_from = value) %>%
    rename(intercept=2,
             Elevation=3,
           Tree_cover=4,
           Anthropization_cover=5,
             logseqdepth=6
           ) %>%
    select(genome,intercept,Elevation,Tree_cover,Anthropization_cover,logseqdepth) %>%
    column_to_rownames(var="genome")

#Phylums
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% postestimates_tree$tip.label) %>%
    arrange(match(genome, postestimates_tree$tip.label)) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome") %>%
    select(phylum)


colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    filter(genome %in% postestimates_tree$tip.label) %>%
    arrange(match(genome, postestimates_tree$tip.label)) %>%
     select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    select(colors) %>%
    pull()

# Basal ggtree
postestimates_tree <- postestimates_tree %>%
        force.ultrametric(.,method="extend") %>%
        ggtree(., size = 0.3)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#Add phylum colors next to the tree tips
postestimates_tree <- gheatmap(postestimates_tree, phylum_colors, offset=-0.2, width=0.1, colnames=FALSE) +
      scale_fill_manual(values=colors_alphabetic)+
      labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
postestimates_tree <- postestimates_tree + new_scale_fill()

# Add posterior significant heatmap

postestimates_tree <- gheatmap(postestimates_tree, post_beta, offset=0, width=0.5, colnames=TRUE, colnames_position="top",colnames_angle=90, colnames_offset_y=1, hjust=0) +
        scale_fill_manual(values=c("#be3e2b","#f4f4f4","#b2b530"))+
        labs(fill="Trend")

postestimates_tree +
        vexpand(.25, 1) # expand top

#Compute the residual correlation matrix
OmegaCor = computeAssociations(m)

# Reference tree (for sorting genomes)
genome_tree_subset <- genome_tree %>%
        keep.tip(., tip=m$spNames)


#Co-occurrence matrix at the animal level
supportLevel = 0.95
toPlot = ((OmegaCor[[1]]$support>supportLevel)
          + (OmegaCor[[1]]$support<(1-supportLevel))>0)*OmegaCor[[1]]$mean

matrix <- toPlot %>%
      as.data.frame() %>%
      rownames_to_column(var="genome1") %>%
      pivot_longer(!genome1, names_to = "genome2", values_to = "cor") %>%
      mutate(genome1= factor(genome1, levels=genome_tree_subset$tip.label)) %>%
      mutate(genome2= factor(genome2, levels=genome_tree_subset$tip.label)) %>%
      ggplot(aes(x = genome1, y = genome2, fill = cor)) +
            geom_tile() +
            scale_fill_gradient2(low = "#be3e2b",
                       mid = "#f4f4f4",
                       high = "#b2b530")+
            theme_void()

vtree <- genome_tree_subset %>%
  force.ultrametric(.,method="extend") %>%
  ggtree(.)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#create composite figure
grid.arrange(grobs = list(vtree,matrix,vtree),
             layout_matrix = rbind(c(4,1,1,1,1,1,1,1,1,1,1,1),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2),
                                   c(3,2,2,2,2,2,2,2,2,2,2,2)))

9.3 Elevation predictions

gradient = seq(940, 2350, by = 100)
gradientlength = length(gradient)

#Treatment-specific gradient predictions
pred_elevation <- constructGradient(m,
                      focalVariable = "Elevation",
                      non.focalVariables = list(logseqdepth=list(1),Tree_cover=list(1), Anthropization_cover=list(1)),
                      ngrid=gradientlength) %>%
                      predict(m, Gradient = ., expected = TRUE) %>%
                      do.call(rbind,.) %>%
                      as.data.frame() %>%
                      mutate(elevation=rep(gradient,1000)) %>%
                      pivot_longer(-c(elevation), names_to = "genome", values_to = "value")

9.3.1 Responses to elevation

# Select desired support threshold
support=0.9
negsupport=1-support

#Get phylum colors from the EHI standard
phylum_colors <- genome_metadata %>%
    left_join(read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv"), by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    #slice(2:5) %>%
    select(colors) %>%
    pull()

getPostEstimate(hM=m, parName="Beta")$support %>%
    as.data.frame() %>%
    mutate(variable=m$covNames) %>%
    pivot_longer(!variable, names_to = "genome", values_to = "value") %>%
    mutate(trend = case_when(
          value >= support ~ "Positive",
          value <= negsupport ~ "Negative",
          TRUE ~ "Neutral")) %>%
    filter(variable=="Elevation") %>%
    select(genome,trend) %>%
    left_join(pred_elevation, by=join_by(genome==genome)) %>%
    group_by(genome, trend, elevation) %>%
    summarize(value = mean(value, na.rm = TRUE)) %>%
    left_join(genome_metadata, by=join_by(genome == genome)) %>%
    ggplot(aes(x=elevation, y=value, group=genome, color=phylum, linetype=trend)) +
        geom_line() +
        scale_linetype_manual(values=c("solid","dashed","solid")) +
        scale_color_manual(values=phylum_colors) +
        facet_grid(fct_rev(trend) ~ phylum) +
        labs(y="Genome abundance (log)",x="Elevation") +
        theme(legend.position = "none") +
        theme_minimal() +
         theme(legend.position = "none",
               axis.text.x = element_text(angle = 45, hjust = 0.8,),
               axis.line.x = element_line(size = 0.5, linetype = "solid", colour = "black"),
)

9.4 Functional predictions

9.4.1 Element level

elements_table <- genome_gifts_filt %>%
    to.elements(., GIFT_db) %>%
    as.data.frame()

community_elements <- pred_elevation %>%
  group_by(elevation, genome) %>%
  mutate(row_id = row_number()) %>%
  pivot_wider(names_from = genome, values_from = value) %>%
  ungroup() %>%
  group_split(row_id) %>%
  as.list() %>%
  lapply(., FUN = function(x){x %>%
    select(-row_id) %>%
    column_to_rownames(var = "elevation") %>%
    as.data.frame() %>%
    exp() %>%
    t() %>%
    tss() %>%
    to.community(elements_table,.,GIFT_db) %>%
    as.data.frame() %>%
    rownames_to_column(var="elevation")
   })

calculate_slope <- function(x) {
  lm_fit <- lm(unlist(x) ~ seq_along(unlist(x)))
  coef(lm_fit)[2]
}

element_predictions <- map_dfc(community_elements, function(mat) {
      mat %>%
        column_to_rownames(var = "elevation") %>%
        t() %>%
        as.data.frame() %>%
        rowwise() %>%
        mutate(slope = calculate_slope(c_across(everything()))) %>%
        select(slope) }) %>%
      t() %>%
      as.data.frame() %>%
      set_names(colnames(community_elements[[1]])[-1]) %>%
      rownames_to_column(var="iteration") %>%
      pivot_longer(!iteration, names_to="trait",values_to="value") %>%
      group_by(trait) %>%
      summarise(mean=mean(value),
        p1 = quantile(value, probs = 0.1),
        p9 = quantile(value, probs = 0.9),
        positive_support = sum(value > 0)/1000,
        negative_support = sum(value < 0)/1000) %>%
      arrange(-positive_support)
# Positively associated
p0<-positive_filtered<-element_predictions %>%
  filter(mean >0) %>%
  arrange(-positive_support) %>%
  filter(positive_support>=0.9) 

if (nrow(positive_filtered) > 0) {
  positive_filtered %>%
  tt() |>
      style_tt(
        i = which(element_predictions$positive_support < 0.9 & element_predictions$negative_support < 0.1),
        background = "#E5D5B1") |>
      style_tt(
        i = which(element_predictions$negative_support > 0.9 & element_predictions$positive_support < 0.1),
        background = "#B7BCCE")
}


#Negatively associated
p1<-negative_filtered<-element_predictions %>%
  filter(mean <0) %>%
  arrange(-negative_support) %>%
  filter(negative_support>=0.9) 

if (nrow(negative_filtered) > 0) {
  negative_filtered %>%
  tt() |>
      style_tt(
        i = which(element_predictions$positive_support < 0.9 & element_predictions$negative_support < 0.1),
        background = "#E5D5B1") |>
      style_tt(
        i = which(element_predictions$negative_support > 0.9 & element_predictions$positive_support < 0.1),
        background = "#B7BCCE")
}
# Positively associated
p0 %>%
  tt()
tinytable_g1s79gh161hmrm9t8q4x
trait mean p1 p9 positive_support negative_support
D0613 0.034264683 0.0113319085 0.060146258 0.971 0.029
B0106 0.017715307 0.0039875628 0.033388901 0.966 0.034
D0609 0.014654471 0.0028010131 0.027577745 0.962 0.038
D0205 0.008600377 0.0020467751 0.015207831 0.953 0.047
D0207 0.031552241 0.0069001701 0.067737031 0.949 0.051
B0802 0.037714918 0.0086104696 0.071624679 0.947 0.053
D0304 0.014938085 0.0035430830 0.027059303 0.945 0.055
B0214 0.019893750 0.0026561423 0.037824383 0.931 0.069
B0217 0.013929202 0.0015844388 0.028027942 0.925 0.075
D0817 0.004336948 0.0003403666 0.008888889 0.919 0.081
#Negatively associated
p1%>%
  tt()
tinytable_s2pl0x65yyej8d8j9m5r
trait mean p1 p9 positive_support negative_support
B0208 -0.0344477317 -0.0618098460 -1.039294e-02 0.016 0.984
S0104 -0.0209740208 -0.0337504690 -8.006082e-03 0.022 0.978
D0805 -0.0004581602 -0.0009344457 -6.018417e-05 0.035 0.965
B0710 -0.0061948836 -0.0108430396 -1.870104e-03 0.037 0.963
B0310 -0.0028164249 -0.0048520436 -8.904523e-04 0.043 0.957
B1029 -0.0003677198 -0.0007876956 -4.280235e-05 0.045 0.955
D0604 -0.0252076749 -0.0512963980 -5.101516e-03 0.050 0.950
D0903 -0.0047866601 -0.0089646580 -1.108197e-03 0.050 0.950
B0218 -0.0116625901 -0.0223161257 -2.061981e-03 0.053 0.947
B0703 -0.0192156802 -0.0373477215 -2.861509e-03 0.059 0.941
D0206 -0.0197688096 -0.0392643553 -3.454538e-03 0.061 0.939
D0701 -0.0051397234 -0.0101283806 -6.650890e-04 0.068 0.932
D0509 -0.0151983776 -0.0301644940 -2.157025e-03 0.070 0.930
D0103 -0.0058062776 -0.0111387262 -5.611137e-04 0.073 0.927
B0903 -0.0040583726 -0.0082299820 -3.182127e-04 0.078 0.922
B0711 -0.0116387135 -0.0239457783 -1.087371e-03 0.079 0.921
D0307 -0.0106690266 -0.0219998777 -7.160822e-04 0.083 0.917
B0702 -0.0164229499 -0.0340972191 -1.406339e-03 0.085 0.915
D0508 -0.0025923862 -0.0054092788 -1.884571e-04 0.089 0.911
D0908 -0.0351466114 -0.0789647021 -1.049356e-03 0.091 0.909
D0912 -0.0009892400 -0.0020007917 -1.425600e-05 0.092 0.908
B0303 -0.0072114512 -0.0152002291 -7.612867e-05 0.097 0.903
#Positively associated
positive <- element_predictions %>%
  filter(mean >0) %>%
  arrange(mean) %>%
  filter(positive_support>=0.9) %>%
  select(-negative_support) %>%
  rename(support=positive_support)

#Negatively associated
negative <- element_predictions %>%
  filter(mean <0) %>%
  arrange(mean) %>%
  filter(negative_support>=0.9) %>%
  select(-positive_support) %>%
  rename(support=negative_support)

all_elements <- bind_rows(positive,negative) %>%
  left_join(GIFT_db,by=join_by(trait==Code_element)) %>%
  mutate(trait=factor(trait,levels=c(rev(positive$trait),rev(negative$trait)))) %>%
  mutate(Code_function=factor(Code_function)) %>%
  mutate(element_legend=str_c(trait," - ",Element)) %>%
  mutate(function_legend=str_c(Code_function," - ",Function)) %>%
  select(trait,mean,p1,p9,element_legend,function_legend) %>% 
  unique()

gift_colors <- read_tsv("data/gift_colors.tsv") %>% 
  mutate(legend=str_c(Code_function," - ",Function))  %>% 
  filter(legend %in% all_elements$function_legend)

all_elements %>%
  ggplot(aes(x=mean, y=fct_reorder(element_legend, mean), xmin=p1, xmax=p9, color=function_legend)) +
      geom_point() +
      geom_errorbar() +
      xlim(c(-0.1,0.09)) +
      geom_vline(xintercept=0) +
      scale_color_manual(values = gift_colors$Color) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Functional trait")

community_elements %>%
    bind_rows() %>%
    pivot_longer(-elevation, names_to = "trait", values_to = "value") %>%
  filter(trait %in% positive$trait) %>%
  mutate(trait=factor(trait, levels=positive$trait)) %>%
    mutate(elevation=as.numeric(elevation)) %>%
    ggplot(aes(x=elevation, y=value)) +
          geom_smooth(method = lm, formula = y ~ x, se = TRUE) +
          #geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), se = TRUE) +
          facet_wrap(~trait, ncol=5, scales="free") +
          theme_minimal() +
          labs(x="Elevation",y="Metabolic Capacity Index")

9.4.1.1 GIFT test visualization

# Aggregate bundle-level GIFTs into the compound level
genome_counts_filt_filt <- tibble::rownames_to_column(genome_counts_filt_filt, var = "genome")
GIFTs_elements_filtered <- elements_table[rownames(elements_table) %in% genome_counts_filt_filt$genome, ]
GIFTs_elements_filtered <- as.data.frame(GIFTs_elements_filtered) %>%
  select_if(~ !is.numeric(.) || sum(.) != 0)

# Get community-weighed average GIFTs per sample
GIFTs_elements_community <- to.community(GIFTs_elements_filtered, genome_counts_filt_filt %>% column_to_rownames(., "genome") %>% tss(), GIFT_db)

GIFTs_elements_community %>%
    as.data.frame() %>%
    rownames_to_column(var="sample") %>%
    pivot_longer(!sample,names_to="trait",values_to="gift") %>%
    left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
    mutate(functionid = substr(trait, 1, 3)) %>%
    mutate(trait = case_when(
      trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
      TRUE ~ trait
    )) %>%
    mutate(functionid = case_when(
      functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
      TRUE ~ functionid
    )) %>%
    mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
    mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
    ggplot(aes(x=sample,y=trait,fill=gift)) +
        geom_tile(colour="white", linewidth=0.2)+
        scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
        facet_grid(functionid ~ Elevation, scales="free",space="free") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
              strip.text.y = element_text(angle = 0)) +
        labs(y="Traits",x="Samples",fill="GIFT")

9.4.2 Functional level

functions_table <- elements_table %>%
    to.functions(., GIFT_db) %>%
    as.data.frame()

community_functions <- pred_elevation %>%
  group_by(elevation, genome) %>%
  mutate(row_id = row_number()) %>%
  pivot_wider(names_from = genome, values_from = value) %>%
  ungroup() %>%
  group_split(row_id) %>%
  as.list() %>%
  lapply(., FUN = function(x){x %>%
    select(-row_id) %>%
    column_to_rownames(var = "elevation") %>%
    as.data.frame() %>%
    exp() %>%
    t() %>%
    tss() %>%
    to.community(functions_table,.,GIFT_db) %>%
    as.data.frame() %>%
    rownames_to_column(var="elevation")
   })
#max-min option
calculate_slope <- function(x) {
  lm_fit <- lm(unlist(x) ~ seq_along(unlist(x)))
  coef(lm_fit)[2]
}

function_predictions <- map_dfc(community_functions, function(mat) {
      mat %>%
        column_to_rownames(var = "elevation") %>%
        t() %>%
        as.data.frame() %>%
        rowwise() %>%
        mutate(slope = calculate_slope(c_across(everything()))) %>%
        select(slope) }) %>%
      t() %>%
      as.data.frame() %>%
      set_names(colnames(community_functions[[1]])[-1]) %>%
      rownames_to_column(var="iteration") %>%
      pivot_longer(!iteration, names_to="trait",values_to="value") %>%
      group_by(trait) %>%
      summarise(mean=mean(value),
        p1 = quantile(value, probs = 0.1),
        p9 = quantile(value, probs = 0.9),
        positive_support = sum(value > 0)/1000,
        negative_support = sum(value < 0)/1000) %>%
      arrange(-positive_support)
# Positively associated
p2<-function_predictions %>%
  filter(mean >0) %>%
  arrange(-positive_support) %>%
  filter(positive_support>=0.9) %>%
  tt() |>
      style_tt(
        i = which(function_predictions$positive_support < 0.9 & function_predictions$negative_support < 0.1),
        background = "#E5D5B1") |>
      style_tt(
        i = which(function_predictions$negative_support > 0.9 & function_predictions$positive_support < 0.1),
        background = "#B7BCCE")

# Negatively associated (there isn't)
#function_predictions %>%
  #filter(mean <0) %>%
  #arrange(-negative_support) %>%
  #filter(negative_support>=0.9) %>%
  #tt() |>
      #style_tt(
        #i = which(function_predictions$positive_support < 0.9 & function_predictions$negative_support < 0.1),
        #background = "#E5D5B1") |>
      #style_tt(
        #i = which(function_predictions$negative_support > 0.9 & function_predictions$positive_support < 0.1),
        #background = "#B7BCCE")
# Positively associated
p2 %>%
  tt()
tinytable_qf9czk9hnevnxut661ae
trait mean p1 p9 positive_support negative_support
S03 0.026906176 0.0104593848 0.044524683 0.973 0.027
B04 0.018264988 0.0051591436 0.033137776 0.970 0.030
D03 0.010497822 0.0021459296 0.019986199 0.943 0.057
D06 0.003285998 0.0004532722 0.006338923 0.933 0.067
B07 0.012228797 0.0004053295 0.024691062 0.906 0.094
#Negatively associated (there isn't)
all_functions <- function_predictions %>%
  left_join(GIFT_db,by=join_by(trait==Code_function)) %>%
  mutate(trait=factor(trait)) %>%
  mutate(function_legend=str_c(trait," - ",Function)) %>%
  select(trait,mean,p1,p9,function_legend) %>% 
  unique()

gift_colors <- read_tsv("data/gift_colors.tsv") %>% 
  mutate(legend=str_c(Code_function," - ",Function))  %>% 
  filter(legend %in% all_functions$function_legend)

all_functions %>%
  ggplot(aes(x=mean, y=fct_reorder(function_legend, mean), xmin=p1, xmax=p9, color=function_legend)) +
      geom_point() +
      geom_errorbar() +
      xlim(c(-0.05,0.05)) +
      geom_vline(xintercept=0) +
      scale_color_manual(values = gift_colors$Color) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Functional trait") +
      guides(col = guide_legend(ncol = 1))

community_functions %>%
    bind_rows() %>%
    pivot_longer(-elevation, names_to = "trait", values_to = "value") %>%
  filter(trait %in% function_predictions$trait) %>%
  mutate(trait=factor(trait, levels=function_predictions$trait)) %>%
    mutate(elevation=as.numeric(elevation)) %>%
    ggplot(aes(x=elevation, y=value)) +
          geom_smooth(method = lm, formula = y ~ x, se = TRUE) +
          #geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), se = TRUE) +
          facet_wrap(~trait, ncol=5, scales="free") +
          theme_minimal() +
          labs(x="Elevation",y="Metabolic Capacity Index")

9.4.2.1 GIFT test visualization

# Aggregate element-level GIFTs into the function level
GIFTs_functions <- to.functions(GIFTs_elements_filtered, GIFT_db)
functions <- GIFTs_functions %>%
  as.data.frame()

# Get community-weighed average GIFTs per sample
GIFTs_functions_community <- to.community(GIFTs_functions, genome_counts_filt_filt %>% column_to_rownames(., "genome") %>% tss(), GIFT_db)

GIFTs_functions_community %>%
    as.data.frame() %>%
    rownames_to_column(var="sample") %>%
    pivot_longer(!sample,names_to="trait",values_to="gift") %>%
    left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
    ggplot(aes(x=trait,y=sample,fill=gift)) +
        geom_tile(colour="white", size=0.2)+
        scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
        facet_grid(Elevation ~ ., scales="free",space="free")

9.4.3 Domain level

domains_table <- functions_table %>%
    to.domains(., GIFT_db) %>%
    as.data.frame()

community_domains <- pred_elevation %>%
  group_by(elevation, genome) %>%
  mutate(row_id = row_number()) %>%
  pivot_wider(names_from = genome, values_from = value) %>%
  ungroup() %>%
  group_split(row_id) %>%
  as.list() %>%
  lapply(., FUN = function(x){x %>%
    select(-row_id) %>%
    column_to_rownames(var = "elevation") %>%
    as.data.frame() %>%
    exp() %>%
    t() %>%
    tss() %>%
    to.community(domains_table,.,GIFT_db) %>%
    as.data.frame() %>%
    rownames_to_column(var="elevation")
   })
#max-min option
calculate_slope <- function(x) {
  lm_fit <- lm(unlist(x) ~ seq_along(unlist(x)))
  coef(lm_fit)[2]
}

domain_predictions <- map_dfc(community_domains, function(mat) {
      mat %>%
        column_to_rownames(var = "elevation") %>%
        t() %>%
        as.data.frame() %>%
        rowwise() %>%
        mutate(slope = calculate_slope(c_across(everything()))) %>%
        select(slope) }) %>%
      t() %>%
      as.data.frame() %>%
      set_names(colnames(community_domains[[1]])[-1]) %>%
      rownames_to_column(var="iteration") %>%
      pivot_longer(!iteration, names_to="trait",values_to="value") %>%
      group_by(trait) %>%
      summarise(mean=mean(value),
        p1 = quantile(value, probs = 0.1),
        p9 = quantile(value, probs = 0.9),
        positive_support = sum(value > 0)/1000,
        negative_support = sum(value < 0)/1000) %>%
      arrange(-positive_support)
# Positively associated
p3<-positive_filtered<-domain_predictions %>%
  filter(mean >0) %>%
  arrange(-positive_support) %>%
  filter(positive_support>=0.9) 

if (nrow(positive_filtered) > 0) {
  positive_filtered %>%
  tt() |>
      style_tt(
        i = which(domain_predictions$positive_support < 0.9 & domain_predictions$negative_support < 0.1),
        background = "#E5D5B1") |>
      style_tt(
        i = which(domain_predictions$negative_support > 0.9 & domain_predictions$positive_support < 0.1),
        background = "#B7BCCE")
}

# Negatively associated (there isn't)
#domain_predictions %>%
  #filter(mean <0) %>%
  #arrange(-negative_support) %>%
  #filter(negative_support>=0.9) %>%
  #tt() |>
      #style_tt(
        #i = which(domain_predictions$positive_support < 0.9 & domain_predictions$negative_support < 0.1),
        #background = "#E5D5B1") |>
      #style_tt(
        #i = which(domain_predictions$negative_support > 0.9 & domain_predictions$positive_support < 0.1),
        #background = "#B7BCCE")
# Positively associated
p3 %>%
  tt()
tinytable_rndiyd15aqw2y1m0tlbt
trait mean p1 p9 positive_support negative_support
Overall 0.006863701 0.0003939157 0.01509651 0.917 0.083
# Negatively associated (there isn't)
all_domains <- domain_predictions %>%
  left_join(GIFT_db,by=join_by(trait==Code_function)) %>%
  mutate(trait=factor(trait)) %>%
  mutate(function_legend=str_c(trait," - ",Function)) %>%
  select(trait,mean,p1,p9) %>% 
  unique()

all_domains %>%
  ggplot(aes(x=mean, y=fct_reorder(trait, mean), xmin=p1, xmax=p9, color=trait)) +
      geom_point() +
      geom_errorbar() +
      xlim(c(-0.02,0.02)) +
      geom_vline(xintercept=0) +
      theme_minimal() +
      labs(x="Regression coefficient",y="Domain level") +
      guides(col = guide_legend(ncol = 1))

community_domains %>%
    bind_rows() %>%
    pivot_longer(-elevation, names_to = "trait", values_to = "value") %>%
  filter(trait %in% domain_predictions$trait) %>%
  mutate(trait=factor(trait, levels=domain_predictions$trait)) %>%
    mutate(elevation=as.numeric(elevation)) %>%
    ggplot(aes(x=elevation, y=value)) +
          geom_smooth(method = lm, formula = y ~ x, se = TRUE) +
          #geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), se = TRUE) +
          facet_wrap(~trait, ncol=5, scales="free") +
          theme_minimal() +
          labs(x="Elevation",y="Metabolic Capacity Index")

9.4.3.1 GIFT test visualization

# Aggregate function-level GIFTs into overall Biosynthesis, Degradation and Structural GIFTs
GIFTs_domains <- to.domains(GIFTs_functions, GIFT_db)
domains <- GIFTs_domains %>%
  as.data.frame()

# Get community-weighed average GIFTs per sample
GIFTs_domains_community <- to.community(GIFTs_domains, genome_counts_filt_filt %>% column_to_rownames(., "genome") %>% tss(), GIFT_db)

GIFTs_domains_community %>%
    as.data.frame() %>%
    rownames_to_column(var="sample") %>%
    pivot_longer(!sample,names_to="trait",values_to="gift") %>%
    left_join(sample_metadata, by = join_by(sample == EHI_number)) %>%
    ggplot(aes(x=trait,y=sample,fill=gift)) +
        geom_tile(colour="white", size=0.2)+
        scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
        facet_grid(Elevation ~ ., scales="free",space="free")


  1. University of Copenhagen, ↩︎

  2. University of Copenhagen, ↩︎

  3. University of Valencia, ↩︎

  4. Centre National de la Recherche Scientifique, ↩︎

  5. University of Valencia and Universidade do Porto, ↩︎

  6. , ↩︎

  7. , ↩︎

  8. University of Lund, ↩︎

  9. University of Copenhagen, ↩︎

  10. University of Copenhagen, ↩︎